## master preamble ------------------------------------------------------
rm(list=ls(all=TRUE)) 

## inherit directories from stata -----------
args = commandArgs(trailingOnly = "TRUE")
if (length(args)) {
  dir.proj = paste0(args[1],'/')
  dir.lib  = args[2]
} 

## if loading from library -------------------
if(dir.lib!='') {
  ## load depenencies --------------------
  library(R.utils,lib.loc=dir.lib)
  library(ggplot2,lib.loc=dir.lib)
  library(tidyr,lib.loc=dir.lib)
  library(stringr,lib.loc=dir.lib)
  library(forcats,lib.loc=dir.lib)
  library(lubridate,lib.loc=dir.lib)
  library(haven,lib.loc=dir.lib)
  library(data.table,lib.loc=dir.lib)
  ## load main packages ------------------
  library(rio,lib.loc=dir.lib)
  library(dplyr,lib.loc=dir.lib)
  library(tidyverse,lib.loc=dir.lib)
  library(fixest,lib.loc=dir.lib)
## otherwise: simple load -----------------
} else {
  library(rio)
  library(dplyr)
  library(tidyverse)
  library(fixest)
}
## --------------------------------------------------------------------

## file preamble ------------------------------------------------------
## randomization seed --------
set.seed(413)

## directories --------
dir.data = paste0(dir.proj,'data/out/')
dir.est  = paste0(dir.proj,'estimates/')
dir.temp = paste0(dir.proj,'_temp/')

## file-specific -------
name.Y   = 'cite_ny1'
name.out = 'recid_spc'
bootrep  = 250

## functions ----------------
rudirichlet <- function(n, d) {
  rexp.mat <- matrix( rexp(d * n, 1) , nrow = n, ncol = d)
  dirichlet.weights <- rexp.mat / rowSums(rexp.mat)
  dirichlet.weights }
## ---------------------------------------------------------------------


## Setup data -------------------
data = import(paste0(dir.data,'4-main.dta'))
data = mutate(
  data,
  Y=data[[name.Y]],D=harsh,W=covbin1,jid=officerid) %>%
  select(citationid,Y,D,Z,W,totfe,jid,countynum,highway,wknd,shift,year,month)


## Additional setup: SPC approach -----
data = mutate(
  data,
  loc     = countynum,
  time    = paste(wknd,shift,sep='_'),
  loctime = paste(loc,time,sep='_'),
  jid_loctime = paste(jid,loctime,sep='_'),
  ym = paste(year,month,sep='_'),
  ym_loc = paste(ym,loc,sep='_'),
  cov_loc = paste(highway,loc,sep='_'))



##########################
## Sample restrictions -------------
## 100 per j, 100 per loctime, 25 per j-loctime, 10 j's per loctime 
temp = data %>% group_by(jid) %>% summarize(N=n())
data = inner_join(data,temp %>% filter(N>=100) %>% select(jid))
temp = data %>% group_by(loctime) %>% summarize(N=n())
data = inner_join(data,temp %>% filter(N>=100) %>% select(loctime))
temp = data %>% group_by(jid_loctime) %>% summarize(N=n())
data = inner_join(data,temp %>% filter(N>=25) %>% select(jid_loctime))
temp = data %>% select(loctime,jid) %>% unique() %>% group_by(loctime) %>% summarize(N=n())
data = inner_join(data,temp %>% filter(N>=10) %>% select(loctime))


## Setup for clustered bootstrap -------------------
list    = unique(select(data,jid))
weights = rudirichlet(nrow(list),bootrep)*10


## Looping -----------
for(j in 0:bootrep) {
  
  ## Setup bootstrap data --------
  if(j==0) {
    boot.list = list %>% mutate(bootwt = 1)
    boot.data = inner_join(data,boot.list,by='jid') }
  if(j>=1) {
    boot.list = list %>% mutate(bootwt = weights[,j])
    boot.data = inner_join(data,boot.list,by='jid') }
  
  ## Get sample means ------------
  y = weighted.mean(boot.data$Y,boot.data$bootwt)
  p = weighted.mean(boot.data$D,boot.data$bootwt)
  
  ## Other pars ------------
  fit = feols(Y~D,boot.data,weights=~bootwt)
  y0d0 = as.numeric(fit$coefficients[1])
  y1d1 = as.numeric(fit$coefficients[1]+fit$coefficients[2])
  
  ## Get weights --------
  wts = boot.data %>% group_by(jid,loc) %>%
    summarize(nwt=sum(bootwt))
  
  ## Compute the tildes ---------
  dv.list = c('D','Y')
  for(dv in dv.list) {
    
    ## Regression fit ------
    fit = fixest::feols(
      as.formula(glue::glue(dv,'~0|jid_loctime+ym_loc+cov_loc')),
      boot.data,weights=~bootwt)
    
    ## Store FE ests -------
    fe  = bind_cols(
      boot.data %>% select(jid,loc,loctime),
      predict(fit,fixef=T)) %>% 
      mutate(phi=jid_loctime,delta=ym_loc,gamma=cov_loc)
    
    ## Avg FE -----------
    fe = left_join(
      fe,fe %>% group_by(loctime) %>% 
        summarize(mugamma=mean(gamma,na.rm=T),mudelta=mean(delta,na.rm=T)),by='loctime')
    
    ## In-location weights ----
    fe = fe %>%
      left_join(fe %>% group_by(loc) %>% summarize(Nloc=n()),by='loc') %>%
      left_join(fe %>% group_by(loctime) %>% summarize(Nloctime=n()),by='loctime') %>%
      mutate(tauwt=Nloctime/Nloc)
    
    ## Collapse to officer-locations -----
    fe  = mutate(fe,tilde=phi+mugamma+mudelta)
    til = fe %>% group_by(jid,loc) %>% summarize(ttil=weighted.mean(tilde,tauwt)) %>%
      mutate(til=case_when(ttil<0~0,ttil>=0&ttil<=1~ttil,ttil>=1~1)) %>% select(-c(ttil))
    assign(paste0('til',dv),til)
  }
  
  ## Put together -------
  til = inner_join(
    tilD %>% mutate(Dtil=til) %>% select(-c(til)),
    tilY %>% mutate(Ytil=til) %>% select(-c(til))) %>% 
    inner_join(wts)
  
  ## Store tilde if first iteration ------
  if(j==0) {
    export(til,paste0(dir.temp,'spc_tilde.csv')) }
  
  
  ########################
  ### Polynomial fits ---------------------
  
  ## Polynomials ---------
  til = mutate(
    til,
    Z1=Dtil^1,Z2=Dtil^2,Z3=Dtil^3,Z4=Dtil^4,
    Z5=Dtil^5,Z6=Dtil^6,Z7=Dtil^7,Z8=Dtil^8)
  
  ## Loop over polynomial order ------
  q.list = c(2,3,4,8)
  for(q in q.list) {
    
    ## Polynomial fit -----
    fit = fixest::feols(
      as.formula(glue::glue('Ytil~',paste(paste0('Z',seq(1:q)),collapse='+'),'|loc')),
      til,weights=~nwt)
    #print(summary(fit)) 
    
    ## Compute pars ----
    fe = stats::predict(fit,newdata=til,fixef=T)
    y0 = weighted.mean(fe$loc,til$nwt)
    y1 = y0+sum(fit$coefficients)
    
    ## Post output -----
    est = data.frame(
      par=c('Y','p','Y0','Y1','ATE','ATT','ATU',
            'Y1D1','Y1D0','Y0D0','Y0D1','Y0D1mY0D0','ATTmATU'),
      est=c(y,p,y0,y1,(y1-y0),(y-y0)/p,(y1-y)/(1-p),
            y1d1,(1/(1-p))*y1-(p/(1-p))*y1d1,
            y0d0,(1/p)*y0-((1-p)/p)*y0d0,
            ((1/p)*y0-((1-p)/p)*y0d0)-y0d0,
            ((y-y0)/p)-(y1-y)/(1-p))) %>% mutate(iter=j,poly=q)
    if(j==0 & q==q.list[1]) {
      boot.out = est }
    else { boot.out = bind_rows(boot.out,est) }
  }
  
  
  #####################
  ### Add simple LATE -----------
  fit = feols(Ytil~Dtil|loc,til,weights=~nwt)
  boot.out = bind_rows(
    boot.out,
    data.frame(
      par=c('slope'),est=c(as.numeric(fit$coefficients))) %>%
      mutate(iter=j,poly=1))
  
  
  ## Progress report -----
  print(paste('Iteration',j,'of',bootrep,'Completed'))
  
}
## End of bootstrap -------------------------


## Bootstrap output -----------
est.main = boot.out %>% filter(iter==0) %>% select(poly,par,est)
est.boot = boot.out %>% filter(iter>=1) %>%
  group_by(poly,par) %>% summarize(se=sd(est),lq=quantile(est,0.05),uq=quantile(est,0.95))
est.out  = inner_join(est.main,est.boot) %>% mutate(lb=est-1.96*se,ub=est+1.96*se)
export(est.out,paste0(dir.est,name.out,'.csv'))


